Murine species trees & rates
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
#library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
library(vroom)
library(ggdist)
source("C:/bin/core/corelib/design.r")
source("C:/bin/core/get_tree_info.r")
#htmltools::includeHTML("../html-chunks/rmd_nav.html")1 Full coding species tree
#tree_type = "astral"
save_tree_fig = F
if(tree_type == "astral"){
cat("188 species.\n11,775 coding loci.\nGene trees inferred with IQtree.\nSpecies tree inferred with ASTRAL (no branch lengths shown).\n")
}else if(tree_type == "concat"){
cat("188 species.\n11,775 coding loci.\nGene trees inferred with IQtree.\nSpecies tree inferred by concatenation of all loci with IQtree.\n")
}## 188 species.
## 11,775 coding loci.
## Gene trees inferred with IQtree.
## Species tree inferred with ASTRAL (no branch lengths shown).
tree_file = "../../data/trees/full_coding_iqtree_astral.cf.rooted.tree"
astral_tree = read.tree(tree_file)
tree_to_df_list = treeToDF(astral_tree)
tree_info_astral = tree_to_df_list[["info"]]
#tree_info_astral = treeToDF(astral_tree)
tree_info_astral = tree_info_astral %>% separate(label, c("astral", "gcf", "scf"), sep="/", remove=F)
tree_info_astral$astral[tree_info_astral$node.type=="tip"] = NA
tree_info_astral$astral = as.numeric(tree_info_astral$astral)
tree_info_astral$gcf = as.numeric(tree_info_astral$gcf)
tree_info_astral$scf = as.numeric(tree_info_astral$scf)
# Read astral tree data
concat_file = "../../data/trees/full_coding_iqtree_concat.cf.rooted.tree"
concat_tree = read.tree(concat_file)
tree_to_df_list = treeToDF(concat_tree)
tree_info_concat = tree_to_df_list[["info"]]
#tree_info_concat = treeToDF(concat_tree)
tree_info_concat = tree_info_concat %>% separate(label, c("bootstrap", "gcf", "scf"), sep="/", remove=F)
tree_info_concat$bootstrap[tree_info_concat$node.type=="tip"] = NA
tree_info_concat$bootstrap = as.numeric(tree_info_concat$bootstrap)
tree_info_concat$gcf = as.numeric(tree_info_concat$gcf)
tree_info_concat$scf = as.numeric(tree_info_concat$scf)
# Read concat tree data
rf = RF.dist(concat_tree, astral_tree)
nrf = RF.dist(concat_tree, astral_tree, normalize=T)
#treecomp = comparePhylo(concat_tree, astral_tree, plot=T)
# Stores node ids of common clades between trees!
#write.csv(tree_info, "../../data/trees/full-coding-concat-cf-rooted.csv", row.names=F)
if(tree_type == "astral"){
tree_info = tree_info_astral
rodent_tree = astral_tree
xmax = 31
}else if(tree_type == "concat"){
tree_info = tree_info_concat
rodent_tree = concat_tree
xmax = 0.125
}1.0.1 Species tree with branches colored by gene concordance factor
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
gcf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$gcf)) +
scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
print(gcf_tree)if(save_tree_fig){
gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
tree_outfile = paste("../../data/trees/", tree_type, "-gcf-tree.pdf", sep="")
ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}
# gCF tree1.0.2 Species tree with branches colored by site concordance factor
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
scf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$scf)) +
scale_color_continuous(name='sCF', low=l, high=h, limits=c(0,100)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(scf_tree)#ggsave("../data/trees/scf-tree.pdf", scf_tree, width=8, height=16, unit="in")
# sCF tree1.0.3 Gene vs site concordance factors colored by branch support
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
if(tree_type == "astral"){
p = ggplot(tree_info, aes(x=gcf, y=scf, color=astral)) +
geom_point() +
scale_color_continuous(name='Astral support', low=l, high=h, limits=c(0.8,1))
}else if(tree_type == "concat"){
p = ggplot(tree_info, aes(x=gcf, y=scf, color=bootstrap)) +
geom_point() +
scale_color_continuous(name='Bootstrap', low=l, high=h, limits=c(0,100))
}
p = p + bartheme() +
theme(legend.title=element_text(size=12))
print(p)2 Gene trees
The 11,774 gene trees contain varying numbers of taxa due to filtering on the alignment and filtering by IQtree for identical sequences.
2.0.1 Distribution of taxa per gene tree
gt_file = "../../data/trees/loci.treefile"
gt = read.tree(gt_file)
gt_data = data.frame("rfs"=c(), "num.tips"=c(), "rf.zeros"=c(), "num.tips.zeros"=c())
for(i in 1:length(gt)){
cur_tips = gt[[i]]$tip.label
pruned_tree = drop.tip(rodent_tree, rodent_tree$tip.label[-match(cur_tips, rodent_tree$tip.label)])
cur_rf = RF.dist(pruned_tree, gt[[i]], normalize=T)
if(cur_rf==0){
gt_data = rbind(gt_data, data.frame("rfs"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=cur_rf, "num.tips.zeros"=length(cur_tips)))
}else{
gt_data = rbind(gt_data, data.frame("rfs"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=NA, "num.tips.zeros"=NA))
}
}
p = ggplot(gt_data, aes(x=num.tips)) +
geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=6), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("# of taxa") +
ylab("# of gene trees") +
bartheme()
print(p)2.0.2 Tree distance (RF) distribution between gene trees and species tree
Since RF cannot handle missing taxa, the species tree is pruned for each gene tree to calculate Robinson-Foulds distance. We use the normalized metric since there are varying numbers of tips per gene tree.
p = ggplot(gt_data, aes(x=rfs)) +
geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("Normalized RF") +
ylab("# of gene trees") +
bartheme()
print(p)3 Average rates per gene with basic model fit (mg94-local)
We ran each of the 11,775 coding loci through HyPhy’s standard MG94 fit with the -local option to estimate a rate for each branch in the input tree.
For input trees we used the gene tree estimated from each individual alignment to reduce false inferences of substitutions that result from tree misspecification.
mg94_local_file = "../../data/rates/full-coding-mg94-local.csv.gz"
#mg94_local_file = "../../data/rates/full-coding-mg94-local.csv"
mg94_local = vroom(mg94_local_file, comment="#")
mg94_local_gene = group_by(mg94_local, file) %>% summarize(dn=mean(dn, na.rm=T), ds=mean(ds, na.rm=T), dn.ds=mean("dn/ds", na.rm=T), nonsynonymous.bl=mean("nonsynonymous bl", na.rm=T), synonymous.bl=mean("synonymous bl", na.rm=T))
p = ggplot(subset(mg94_local_gene, ds < 0.05), aes(x=ds, y=dn)) +
geom_point(size=2, alpha=0.2, color="#333333") +
#geom_smooth(method="lm", se=F, ) +
xlab("Avg. dS per gene") +
ylab("Avg. dN per gene") +
bartheme()
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666")
print(p)ds_p = ggplot(subset(mg94_local_gene, ds < 0.05), aes(x=ds)) +
geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("dS") +
ylab("# of genes") +
bartheme()
print(ds_p)
# Distribution of dS when using concatenated treedn_p = ggplot(mg94_local_gene, aes(x=dn)) +
geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("dN") +
ylab("# of genes") +
bartheme()
print(dn_p)
# Distribution of dS when using gene treesds_filter_level = quantile(mg94_local_gene$ds, 0.98)
#ds_filter_level = 0.03
ds_filter = subset(mg94_local_gene, ds > ds_filter_level)$file
# Get a list of genes to filter out in subsequent analyses based on dS
print(paste("Removing", length(ds_filter), "genes with dS above", ds_filter_level, "from subsequent analyses."))## [1] "Removing 236 genes with dS above 0.0286355420964189 from subsequent analyses."
#write.csv(ds_filter, file="../../data/rates/full-coding-mg94-local-ds-filter-0.95quant.csv", row.names=F)4 Average rates per branch with basic model fit (mg94-local)
Because we used the gene trees, in order to quantify rates on species tree branches we first needed to check whether branches in the species tree exist in a given gene tree.
This resulted in three categories for a species tree branch in a given gene tree:
- Full clade: All species in the clade that descends from the branch in the species tree are present as a monophyletic split in the gene tree.
- Parital clade: Not all species in the clade that descends from the branch in the species tree are present in the gene tree, but the ones that are present form a monophyletic split.
- Discordant/missing clade: Either the species in the clade that descends from the branch in the species tree do not form a monophyletic split in this gene tree (discordant) or ALL species in this clade are missing from this gene tree (missing)
Average rates are then calculated as (for dS, for example):
\[\frac{\sum_{i=1}^{n}\text{branch dS}_i}{n}\]
Where:
- \(n = \text{# full clade genes} + \text{# partial clade genes}\) for this branch
4.0.1 Species tree branch presence/absence per gene
if(tree_type == "astral"){
tree_rates = read.csv("../../data/rates/full-coding-astral-cf-rooted-rates.csv", header=T)
}else if(tree_type == "concat"){
tree_rates = read.csv("../../data/rates/full-coding-concat-cf-rooted-rates.csv", header=T)
}
# Read the branch average rate data
full_clade = select(tree_rates, clade, node.type, num.genes.full)
full_clade$label = "Full clade"
names(full_clade)[3] = "num.genes"
partial_clade = select(tree_rates, clade, node.type, num.genes.partial)
partial_clade$label = "Partial clade"
names(partial_clade)[3] = "num.genes"
no_clade = select(tree_rates, clade, node.type, num.genes.no.clade)
no_clade$label = "Discordant/missing clade"
names(no_clade)[3] = "num.genes"
clade_counts = rbind(full_clade, partial_clade, no_clade)
# Convert branch categories to long format
clade_counts$label = factor(clade_counts$label, levels=c("Full clade", "Partial clade", "Discordant/missing clade"))
branch_counts = ggplot(clade_counts, aes(x=label, y=num.genes, group=label, color=node.type)) +
geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
ylab("# of genes") +
xlab("Species tree\nbranch classification") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_counts)These measures are highly correlated with gene concordance factors:
4.0.2 Correlation between gCF and the % of genes that contain the descending clade for each species tree branch
tree_rates$clade.perc = (tree_rates$num.genes.full + tree_rates$num.genes.partial) / (tree_rates$num.genes.full + tree_rates$num.genes.partial + tree_rates$num.genes.no.clade)
p = ggplot(tree_rates, aes(x=gcf, y=clade.perc)) +
geom_point(size=2, alpha=0.4, color="#333333") +
geom_smooth(method="lm", se=F, linetype="dashed", color="#920000") +
xlab("gCF") +
ylab("% of genes with clade present") +
bartheme() +
theme(legend.position="none")
print(p)4.0.3 This results in the following distributions for average rates across branches:
p = ggplot(subset(tree_rates, node.type!="ROOT"), aes(x=avg.ds, y=avg.dn, color=node.type)) +
geom_point(size=2, alpha=0.2) +
geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("Avg. dS per branch") +
ylab("Avg. dN per branch") +
bartheme() +
theme(legend.position="bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
print(p)tree_rates$ds.outlier = ifelse(tree_rates$avg.ds>0.2,tree_rates$node,'')
tree_rates$dn.outlier = ifelse(tree_rates$avg.dn>0.01,tree_rates$node,'')4.0.4 The tree with branches colored by dS and outliers labeled:
h = corecol(numcol=1, pal="wilke", offset=1)
l = corecol(numcol=1, offset=1)
rate_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_rates$avg.ds)) +
scale_color_continuous(name='Avg. dS', low=l, high=h, limits=c(0,0.9)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(tree_rates$avg.ds>0.2,as.character(node),'')), label.size=NA, fill="transparent")
#geom_text(aes(label=node), hjust=-.1, color="#006ddb")
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(rate_tree)4.0.5 The tree with branches colored by dN and outliers labeled:
h = corecol(numcol=1, pal="wilke", offset=1)
l = corecol(numcol=1, offset=1)
rate_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_rates$avg.dn)) +
scale_color_continuous(name='Avg. dN', low=l, high=h, limits=c(0,0.015)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(tree_rates$avg.dn>0.01,as.character(node),'')), label.size=NA, fill="transparent")
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(rate_tree)5 Rates and discordance
5.0.1 dS vs. concordance factors
Only branches with avg. dS < 0.05
ds_gcf_p = ggplot(subset(tree_rates, node.type!="ROOT" & avg.ds < 0.05), aes(x=avg.ds, y=gcf)) +
geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("Avg. dS per branch") +
ylab("gCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
ds_gcf_p = ggExtra::ggMarginal(ds_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")
ds_scf_p = ggplot(subset(tree_rates, node.type!="ROOT" & avg.ds < 0.05), aes(x=avg.ds, y=scf)) +
geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("Avg. dS per branch") +
ylab("sCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
ds_scf_p = ggExtra::ggMarginal(ds_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")
p = plot_grid(ds_gcf_p, ds_scf_p, ncol=2)
print(p)5.0.2 dN vs. concordance factors
Only branches with avg. dN < 0.01
dn_gcf_p = ggplot(subset(tree_rates, node.type!="ROOT" & avg.dn < 0.01), aes(x=avg.dn, y=gcf)) +
geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("Avg. dN per branch") +
ylab("gCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
dn_gcf_p = ggExtra::ggMarginal(dn_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")
dn_scf_p = ggplot(subset(tree_rates, node.type!="ROOT" & avg.dn < 0.01), aes(x=avg.dn, y=scf)) +
geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("Avg. dN per branch") +
ylab("sCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
dn_scf_p = ggExtra::ggMarginal(dn_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")
p = plot_grid(dn_gcf_p, dn_scf_p, ncol=2)
print(p)5.0.3 dN/dS vs. concordance factors
Only branches with avg. dN/dS < 0.5
dnds_gcf_p = ggplot(subset(tree_rates, node.type!="ROOT" & avg.dn.ds < 0.5), aes(x=avg.dn.ds, y=gcf)) +
geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("Avg. dN/dS per branch") +
ylab("gCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
dnds_gcf_p = ggExtra::ggMarginal(dnds_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")
dnds_scf_p = ggplot(subset(tree_rates, node.type!="ROOT" & avg.dn.ds < 0.5), aes(x=avg.dn.ds, y=scf)) +
geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("Avg. dN/dS per branch") +
ylab("sCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
dnds_scf_p = ggExtra::ggMarginal(dnds_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")
p = plot_grid(dnds_gcf_p, dnds_scf_p, ncol=2)
print(p)6 Basic rate and phenotype correlations with tip branches
pheno = read.csv("../../data/phenotype-data/combined-phenotype-data.csv", header=T, comment.char="#")
tips = subset(tree_rates, node.type=="tip")
names(tips)[2] = "sample"
pheno_rates = merge(pheno, tips, by="sample")
pheno_rates = select(pheno_rates, sample, Adult_Mass.g., Total_Length.mm., Head.Body_Length.mm., Tail_Length.mm., Hind_Foot_Length.mm., Relative_Tail_Length, Relative_Hind_Foot_Length, avg.dn, avg.ds, avg.dn.ds)
pheno_rates_long = melt(pheno_rates, id.vars=c("sample", "avg.dn", "avg.ds", "avg.dn.ds"))
#pheno_rates_long = gather(pheno_rates, sample, value, Adult_Mass.g., Total_Length.mm., Head.Body_Length.mm., Tail_Length.mm., Hind_Foot_Length.mm., Relative_Tail_Length, Relative_Hind_Foot_Length)6.0.1 Avg dS per tip vs phenotype
p = ggplot(pheno_rates_long, aes(x=value, y=avg.ds)) +
geom_point(size=2, alpha=0.2, color="#333333") +
geom_smooth(method="lm", se=F, linetype="dashed", color=corecol(numcol=1, pal="wilke", offset=2)) +
xlab("Avg. trait value per tip") +
ylab("Avg. dS per tip") +
facet_wrap(~variable, scales="free_x") +
bartheme()
print(p)6.0.2 Avg dN per tip vs phenotype
p = ggplot(pheno_rates_long, aes(x=value, y=avg.dn)) +
geom_point(size=2, alpha=0.2, color="#333333") +
geom_smooth(method="lm", se=F, linetype="dashed", color=corecol(numcol=1, pal="wilke", offset=2)) +
xlab("Avg. trait value per tip") +
ylab("Avg. dN per tip") +
facet_wrap(~variable, scales="free_x") +
bartheme()
print(p)